StorageIndexer.java example

Explorer

eclipse-instasearch-master
- instasearch
  - src
    - it
      - unibz
        instasearch
        InstaSearch.java
        InstaSearchPlugin.java
        InstaSearchStartup.java
        actions
        BuildIndexActionDelegate.java
        CheckUpdatesActionDelegate.java
        CollapseAllActionDelegate.java
        DeleteIndexActionDelegate.java
        ExpandAllActionDelegate.java
        IncrementalSearchActionDelegate.java
        InstaSearchActionDelegate.java
        OpenSearchDialog.java
        PreferencesAction.java
        ShowExceptionAction.java
        ShowInstaSearchAction.java
        VisitHomePageActionDelegate.java
        indexing
        Field.java
        FileAnalyzer.java
        LengthNormSimilarity.java
        QueryAnalyzer.java
        ResourceCollector.java
        SearchQuery.java
        SearchResult.java
        SearchResultDoc.java
        Searcher.java
        StorageIndexer.java
        WorkspaceIndexer.java
        WorkspaceIndexerJDT.java
        querying
        CSVExpander.java
        CurrentProjectSetter.java
        FieldAliasConverter.java
        FileNameSearcher.java
        FilterSetter.java
        FolderSearcher.java
        LastTermQueryPrefixer.java
        LowercaseConverter.java
        ModifiedTimeConverter.java
        PhraseSearcher.java
        QueryFuzzifier.java
        QueryVisitor.java
        UppercaseNameExpander.java
        VisitableQuery.java
        WorkingSetExpander.java
        tokenizers
        CamelCaseTokenizer.java
        DotSplitTokenizer.java
        TermSplitTokenizer.java
        WordSplitTokenizer.java
        standard
        CharStream.java
        FastCharStream.java
        ParseException.java
        StandardTokenizer.java
        StandardTokenizerConstants.java
        StandardTokenizerTokenManager.java
        Token.java
        TokenMgrError.java
        jobs
        CheckUpdatesJob.java
        DeleteIndexJob.java
        IndexUpdateJob.java
        IndexingJob.java
        UpdatePluginJob.java
        prefs
        InstaSearchPreferencePage.java
        PreferenceConstants.java
        PreferenceInitializer.java
        ui
        DropdownMenuProvider.java
        InstaSearchPage.java
        InstaSearchUI.java
        InstaSearchView.java
        MatchHighlightJob.java
        ReportErrorDialog.java
        ResultContentProvider.java
        ResultLabelProvider.java
        SearchContentProposalProvider.java
        SearchJob.java
        SearchViewControl.java
        StyledTextContentAdapter.java
  - test
    - it
      - unibz
        instasearch
        indexing
        SearcherTest.java
        TestSearcher.java
        TestStorage.java

/*
 * Copyright (c) 2009 Andrejs Jermakovics.
 * 
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     Andrejs Jermakovics - initial implementation
 */
package it.unibz.instasearch.indexing;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.eclipse.core.resources.IStorage;
import org.eclipse.core.runtime.CoreException;
import org.eclipse.core.runtime.IPath;

/**
 * Indexes documents of type {@link IStorage} into a Lucene index.
 * <p>
 * The class creates configured {@link IndexWriter}s, adds and removes single
 * storages, deletes or optimizes the whole index and notifies an
 * {@link IndexChangeListener} when the index is reset or updated.
 * Operations that may fail temporarily are run through
 * a retry loop (see {@link RetryingRunnable}).
 */
public class StorageIndexer 
{
	/** Maximum number of terms indexed per one document */
	private static final int MAX_TERMS_PER_DOC = 200000;
	/** Placeholder stored in a field when there is no real value (e.g. no file extension or no jar) */
	public static final String NO_VALUE = "<none>";
	/** Minimum word length passed to the {@link FileAnalyzer} */
	public static final int MIN_WORD_LENGTH = 1;
	/** Analyzer shared by all writers and by {@link #extractTextTerms(String)} */
	protected static final FileAnalyzer fileAnalyzer = new FileAnalyzer(MIN_WORD_LENGTH);

	/** Listener notified about index changes; never null (defaults to a no-op listener) */
	private IndexChangeListener changeListener = new NullIndexChangeListener();
	
	/** Similarity set on every writer created by {@link #createIndexWriter(boolean)} */
	private static final Similarity similarity = new LengthNormSimilarity();
	/** Maximum number of times a {@link RetryingRunnable} is run before giving up */
	private static final int MAX_RETRY_ATTEMPTS = 10;
	/** Lazily created index directory, see {@link #getIndexDir()} */
	private Directory indexDir;

	/**
	 * Creates the indexer and removes a stale write lock from the index directory, if any.
	 * <p>
	 * Note: this calls the overridable {@link #getIndexDir()} from the constructor, so
	 * an overriding implementation must not depend on subclass state that is
	 * initialized after the super constructor returns.
	 * 
	 * @throws IOException if the index directory cannot be obtained or unlocked
	 */
	public StorageIndexer() throws IOException 
	{
		checkLock();
	}

	/**
	 * Forcibly unlocks the index directory if it is locked.
	 * 
	 * @throws IOException if the lock state cannot be checked or the lock cannot be released
	 */
	private void checkLock() throws IOException 
	{
		Directory dir = getIndexDir(); // local name differs from the field to avoid shadowing
		
		if( IndexWriter.isLocked(dir) ) // should not be locked at startup, unlock
			IndexWriter.unlock(dir);
	}

	/**
	 * Returns the directory holding the index, creating an in-memory
	 * {@link RAMDirectory} on first use.
	 * 
	 * @return the index directory
	 * @throws IOException declared for implementations that open the directory from storage
	 */
	public Directory getIndexDir() throws IOException 
	{
		if( indexDir == null ) 
			indexDir = new RAMDirectory();

		return indexDir;
	}

	/**
	 * Opens a writer on the index directory configured with the shared analyzer,
	 * similarity, a merge factor of 2 and a limit of {@link #MAX_TERMS_PER_DOC} terms per field.
	 * The caller is responsible for closing the returned writer.
	 * 
	 * @param create passed to the {@link IndexWriter} constructor: true to create a new
	 *               index (overwriting an existing one), false to append to the existing index
	 * @return IndexWriter
	 * @throws IOException if the writer cannot be opened
	 */
	public IndexWriter createIndexWriter(boolean create) throws IOException 
	{
		IndexWriter indexWriter = new IndexWriter(getIndexDir(), fileAnalyzer, create, MaxFieldLength.UNLIMITED);

		indexWriter.setMergeFactor(2); // use less resources (although slower)
		indexWriter.setSimilarity(similarity);
		indexWriter.setMaxFieldLength(MAX_TERMS_PER_DOC); // replaces the UNLIMITED passed to the constructor

		return indexWriter;
	}

	/**
	 * @return true if an index exists in the index directory
	 * @throws IOException if the index directory cannot be accessed
	 */
	public boolean isIndexed() throws IOException 
	{
		return IndexReader.indexExists(getIndexDir());
	}

	/**
	 * Check if the index can be read by opening (and closing) a read-only reader on it.
	 * 
	 * @return whether the index is readable; false if opening or closing the reader
	 *         fails with an {@link IOException}
	 */
	public boolean isReadable(){

		try {
			IndexReader reader = IndexReader.open(getIndexDir(), true);
			reader.close();

		} catch (IOException readingException) {
			return false;
		}

		return true;
	}

	/**
	 * Deletes the whole index.
	 * <p>
	 * The change listener is told about the reset first, then the index is emptied,
	 * all remaining files in the directory are deleted and the directory is closed.
	 * The deletion is retried on any failure, see {@link #MAX_RETRY_ATTEMPTS}.
	 * 
	 * @throws Exception the last failure if all attempts failed
	 */
	public void deleteIndex() throws Exception {

		RetryingRunnable runnable = new RetryingRunnable() 
		{	
			public void run() throws Exception 
			{
				IndexWriter w = createIndexWriter(true); // open for writing and close (make empty)
				boolean emptied = false;
				try
				{
					w.deleteAll();
					w.commit();
					emptied = true;
				}
				finally
				{
					// always release the writer (and its write lock), 
					// otherwise the next retry cannot open a new writer
					if( emptied )
						w.close(true);
					else
						closeQuietly(w);
				}
				
				Directory dir = getIndexDir();
				for(String file: dir.listAll())
				{
					if( dir.fileExists(file) ) // still exits
					{
						dir.sync(file);
						dir.deleteFile(file);
					}
				}
				dir.close();
				
				// do not keep handing out the closed directory; a fresh one is created on next use
				if( dir == StorageIndexer.this.indexDir )
					StorageIndexer.this.indexDir = null;
			}
			
			public boolean handleException(Throwable e) 
			{
				return true; // always retry
			}
		};
		
		changeListener.onIndexReset(); // close searcher because index is deleted
		
		runRetryingRunnable(runnable); // delete index with retry
	}

	/**
	 * Optimizes the index if one exists and notifies the change listener about the update.
	 * Does nothing when there is no index.
	 * 
	 * @throws Exception if the index cannot be opened, optimized or closed
	 */
	public void optimizeIndex() throws Exception {
		if( ! isIndexed() )
			return;

		IndexWriter w = createIndexWriter(false);
		boolean optimized = false;
		try
		{
			w.optimize();
			optimized = true;
		}
		finally
		{
			// release the writer even when optimize fails so the index does not stay locked
			if( optimized )
				w.close();
			else
				closeQuietly(w);
		}

		changeListener.onIndexUpdate();
	}

	/**
	 * Closes the writer while a failure is already being propagated, 
	 * ignoring any secondary exception so that the original failure is not masked.
	 * 
	 * @param writer writer to close
	 */
	private static void closeQuietly(IndexWriter writer) 
	{
		try {
			writer.close();
		} catch(Exception ignored) {
			// the primary failure is the one the caller needs to see
		}
	}

	/**
	 * @param changeListener the changeListener to set; null installs a listener that does nothing
	 */
	public void setIndexChangeListener(IndexChangeListener changeListener) {
		if( changeListener == null ) // keep the "never null" guarantee the rest of the class relies on
			changeListener = new NullIndexChangeListener();
		
		this.changeListener = changeListener;
	}

	/**
	 * @return the changeListener
	 */
	protected IndexChangeListener getIndexChangeListener() {
		return changeListener;
	}

	/**
	 * Adds one storage as a document to the index. 
	 * The contents are indexed from a reader; path, project, name, lower-cased extension, 
	 * modification stamp and jar are stored as not-analyzed fields.
	 * 
	 * @param indexWriter writer to add the document to (not closed by this method)
	 * @param storage storage to index
	 * @param projectName name of the project stored with the document
	 * @param modificationStamp modification stamp stored with the document
	 * @param jar path to jar file containing this file or null
	 * @throws IOException if the contents cannot be obtained (the cause is wrapped) 
	 *                     or the document cannot be added
	 */
	public void indexStorage(IndexWriter indexWriter, IStorage storage, String projectName,
			long modificationStamp, String jar) throws IOException 
	{
		InputStream contents;
		try
		{
			contents = storage.getContents();
		}
		catch (Exception e)
		{
			throw new IOException(e);
		}
		
		// NOTE(review): decodes with the platform default charset - confirm this is intended
		BufferedReader isReader = new BufferedReader(new InputStreamReader(contents));
		try
		{
			IPath fullPath = storage.getFullPath();
			String ext = fullPath.getFileExtension();
			if( ext == null ) ext = NO_VALUE;

			Document doc = new Document();

			doc.add(createLuceneField(Field.CONTENTS, 	isReader));
			doc.add(createLuceneField(Field.FILE, 		fullPath.toString()));
			doc.add(createLuceneField(Field.PROJ, 		projectName));
			doc.add(createLuceneField(Field.NAME, 		fullPath.lastSegment()));
			doc.add(createLuceneField(Field.EXT, 		ext.toLowerCase(Locale.ENGLISH)));
			doc.add(createLuceneField(Field.MODIFIED, 	Long.toString(modificationStamp)));
			doc.add(createLuceneField(Field.JAR, 		(jar==null)?NO_VALUE:jar));

			indexWriter.addDocument(doc);
		}
		finally
		{
			// make sure the stream is released on every path; a close failure is ignored 
			// so that a successfully added document is not reported as failed (and retried)
			try {
				isReader.close();
			} catch(IOException ignored) {
				// nothing useful can be done here
			}
		}
	}

	/**
	 * Runs the runnable until it succeeds, its exception handler asks to stop, 
	 * the thread is interrupted or {@link #MAX_RETRY_ATTEMPTS} attempts were made.
	 * Between attempts the thread sleeps, one second longer after each failure.
	 * 
	 * @param runnable the operation to run
	 * @throws Exception the last failure; a non-Exception Throwable is wrapped in an Exception
	 */
	private static void runRetryingRunnable(RetryingRunnable runnable) throws Exception
	{
		Throwable lastException = null;
		
		for(int attempt = 1; attempt <= MAX_RETRY_ATTEMPTS; attempt++)
		{	
			try
			{
				runnable.run();
				return; // success
			} 
			catch(Throwable e) // exception during run occured
			{
				lastException = e;
				
				if( ! runnable.handleException(e) )
					break;
			}
			
			if( attempt == MAX_RETRY_ATTEMPTS )
				break; // no further attempt follows, so do not wait before reporting the failure
			
			try {
				Thread.sleep( attempt * 1000L ); // wait a bit longer each time for files to be freed
			} catch(InterruptedException interrupted) { 
				Thread.currentThread().interrupt(); // keep the interrupt visible to the caller
				break;
			}
		}
		
		if( lastException != null )
		{
			if( lastException instanceof Exception )
				throw (Exception)lastException;
			else
				throw new Exception(lastException);
		}
	}
	
	/**
	 * Makes several attempts to index storage.
	 * Occasionally the index files get locked (by other processes) and are temporarily not writable.
	 * <p>
	 * On an {@link OutOfMemoryError} the writer's maximum field length is lowered to
	 * Lucene's default once; if it is already at (or below) the default, retrying stops.
	 * On an {@link IOException} the change listener is told the index was reset before retrying.
	 * 
	 * @param indexWriter writer to add the document to
	 * @param storage storage to index
	 * @param projectName name of the project stored with the document
	 * @param modificationStamp modification stamp stored with the document
	 * @param jar path to jar file containing this file or null
	 * @throws Exception the last failure if indexing did not succeed
	 */
	protected void indexStorageWithRetry(final IndexWriter indexWriter, final IStorage storage,
			final String projectName, final long modificationStamp, final String jar) throws Exception
	{
		RetryingRunnable runnable = new RetryingRunnable()
		{
			public void run() throws Exception 
			{
				indexStorage(indexWriter, storage, projectName, modificationStamp, jar);
			}
			
			public boolean handleException(Throwable e) 
			{
				if( e instanceof OutOfMemoryError )
				{
					if( indexWriter.getMaxFieldLength() > IndexWriter.DEFAULT_MAX_FIELD_LENGTH )
						indexWriter.setMaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH); // use less memory
					else 
						return false; // already at the minimum we are willing to use
				}
				else if( e instanceof IOException)
				{
					changeListener.onIndexReset(); // close searcher
				}
				
				return true; // keep retrying
			}
		};
		
		runRetryingRunnable(runnable);
	}
	
	/**
	 * An operation that can be run repeatedly until it succeeds
	 */
	public interface RetryingRunnable
	{
		/**
		 * Performs the operation
		 * 
		 * @throws Exception if the operation failed
		 */
		void run() throws Exception;
		
		/**
		 * If exception occurs during run()
		 * 
		 * @param e the failure thrown by run()
		 * @return true if should run again, false if stop and re-throw exception
		 */
		boolean handleException(Throwable e);
	}
	
	/**
	 * Removes the documents of the given storage (matched by its full path) from the index.
	 * 
	 * @param storage storage whose documents should be deleted
	 * @throws Exception if the index cannot be opened or the documents cannot be deleted
	 */
	public void deleteStorage(IStorage storage) throws Exception 
	{	
		// resolve the path first so that a failure here cannot leak an open reader
		String filePath = storage.getFullPath().toString();
		Term term = Field.FILE.createTerm(filePath);
		
		IndexReader reader = IndexReader.open(getIndexDir(), false);
		try
		{
			reader.deleteDocuments(term);
		}
		finally
		{
			reader.close(); // always release the reader (and the lock taken for deleting)
		}
	}


	/**
	 * Creates a stored, not analyzed Lucene field 
	 * 
	 * @param fieldName field whose name is used
	 * @param value field value
	 * @return the Lucene field
	 */
	private static org.apache.lucene.document.Field createLuceneField(Field fieldName, String value) {
		return new org.apache.lucene.document.Field(fieldName.toString(), value, 
				Store.YES, org.apache.lucene.document.Field.Index.NOT_ANALYZED);
	}

	/**
	 * Creates a Lucene field that reads its value from a reader and keeps term vectors
	 * 
	 * @param fieldName field whose name is used
	 * @param reader source of the field value
	 * @return the Lucene field
	 */
	private static org.apache.lucene.document.Field createLuceneField(Field fieldName, Reader reader) {
		return new org.apache.lucene.document.Field(fieldName.toString(), 
				reader, TermVector.YES);
	}

	/**
	 * Extracts terms from text using the same analyzer that is used for indexing.
	 * Terms are lower-cased before being used as keys.
	 * 
	 * @param text text to tokenize; null is treated as empty text
	 * @return a map of terms to their start offsets in text, empty if there are no terms
	 * @throws IOException if tokenizing fails
	 */
	public static Map<String, List<Integer>> extractTextTerms(String text) throws IOException {
		Map<String, List<Integer>> terms = new HashMap<String, List<Integer>>();
		
		if( text == null )
			return terms;
		
		TokenStream tokenStream = fileAnalyzer.tokenStream(Field.CONTENTS.toString(), new StringReader(text));
		
		try
		{
			TermAttribute termAtt = (TermAttribute) tokenStream.addAttribute(TermAttribute.class);
			OffsetAttribute offsetAtt = (OffsetAttribute) tokenStream.addAttribute(OffsetAttribute.class);
			
			while(tokenStream.incrementToken())
			{
				String termText = termAtt.term().toLowerCase(Locale.ENGLISH);
				int offset = offsetAtt.startOffset();
				
				List<Integer> offsets = terms.get(termText);

				if( offsets == null ) {
					offsets = new LinkedList<Integer>();
					terms.put(termText, offsets);
				}

				offsets.add(offset);
			}
		}
		finally
		{
			tokenStream.close(); // release the stream even when tokenizing fails
		}

		return terms;
	}

	/**
	 * Listener that gets called when index has changed
	 */
	public interface IndexChangeListener
	{
		/** Index was updated with files or files were removed */
		void onIndexUpdate();

		/** Index was reset - created or deleted */
		void onIndexReset();
	}

	/** Empty implementation to avoid null checks (Null Object pattern) */
	private static class NullIndexChangeListener implements IndexChangeListener 
	{
		public void onIndexUpdate() {}
		public void onIndexReset() {}
	}

}